In this project I am answering the following questions:
See here for other iPython notebooks on this project.
Project (datasets and the source code) is available on GitHub
The news items and the curated tweets used in this study were scraped from theplazz.com, approximately matching the duration of the 113th US Congress, i.e. Jan 2013 - Jan 2015. Here is an annotated screenshot of one of the news items published on this news media site:
In [1]:
cd ..
In [2]:
import twitter
import pandas as pd
import numpy as np
import plotly.plotly as py
from plotly.graph_objs import *
from mykeys import tw
import networkx as nx
import itertools
from collections import Counter
%matplotlib inline
In [3]:
def oauth_login():
    """Build an authorised Twitter API client.

    The credentials come from ``tw``, the dictionary imported from mykeys.py.

    Returns:
        The ``twitter.Twitter`` object created with the OAuth credentials.
    """
    # Order matters: these values are handed to OAuth positionally.
    key_names = ('OAUTH_TOKEN', 'OAUTH_TOKEN_SECRET',
                 'CONSUMER_KEY', 'CONSUMER_SECRET')
    credentials = [tw[key] for key in key_names]
    return twitter.Twitter(auth=twitter.oauth.OAuth(*credentials))
def get_members(members):
    """Keep only the interesting fields of a Twitter list-members response.

    Args:
        members: decoded JSON response; its ``'users'`` entry is an iterable
            of per-user dictionaries.

    Returns:
        A list with one tuple per user, holding the values of the fields
        below in that order.
    """
    wanted = ('id', 'screen_name', 'name', 'location', 'description',
              'created_at', 'friends_count', 'followers_count',
              'statuses_count', 'favourites_count')
    return [tuple(user[field] for field in wanted)
            for user in members['users']]
def tw_to_pol(twitter_api, slug, owner_screen_name, group):
    """Get members of a twitter list with known political group into a dataframe.

    Args:
        twitter_api: client object exposing ``lists.members(...)``.
        slug: slug of the Twitter list.
        owner_screen_name: screen name of the account owning the list.
        group: party label stored in the ``party`` column of every row.

    Returns:
        pandas.DataFrame with one row per list member, the columns named
        below plus ``party``.
    """
    # Defined locally: the previous code referenced a `header` variable that
    # only exists inside get_politicians(), which raised NameError here.
    # The order mirrors the tuples produced by get_members().
    columns = ['id', 'screen_name', 'name', 'location', 'description',
               'created_at', 'friends', 'followers', 'statuses', 'favorites']
    resp = twitter_api.lists.members(slug=slug,
                                     owner_screen_name=owner_screen_name,
                                     cursor=-1, count=5000)
    members = get_members(resp)
    df = pd.DataFrame(members, columns=columns)
    df['party'] = group
    return df
def get_politicians():
    """Download 113th congress tweeps using public Twitter lists.

    Fetches the members of each hard-coded Twitter list, labels them with the
    list's party, merges everything, drops rows that are exact duplicates and
    writes the result to ``data/US-politicians.csv``.

    Returns:
        pandas.DataFrame holding the merged, de-duplicated members.
    """
    polists = [
        {'slug': 'senaterepublicans', 'owner_screen_name': 'Senate_GOPs', 'group': 'gop'},  #62
        {'slug': 'house-republicans', 'owner_screen_name': 'HouseGOP', 'group': 'gop'},  #260
        {'slug': 'housegop', 'owner_screen_name': 'GOPLeader', 'group': 'gop'},  #237
        {'slug': 'elected-democrats', 'owner_screen_name': 'TheDemocrats', 'group': 'dem'},  #259
        {'slug': 'house-democrats', 'owner_screen_name': 'DannyMariachi', 'group': 'dem'},  #188
        {'slug': 'senatedemocrats', 'owner_screen_name': 'SenateDems', 'group': 'dem'},  #52
    ]
    twitter_api = oauth_login()
    # Collect the per-list frames and concatenate once. Growing a frame with
    # DataFrame.append() in a loop re-copies all rows on every iteration, and
    # append() no longer exists in pandas >= 2.0.
    frames = [tw_to_pol(twitter_api, polist['slug'],
                        polist['owner_screen_name'], polist['group'])
              for polist in polists]
    df = pd.concat(frames).drop_duplicates()
    # NOTE(review): drop_duplicates() only removes rows identical in every
    # column; an account present in two lists with differing counts stays twice.
    df.to_csv('data/US-politicians.csv', encoding='utf-8', index=False)
    return df
In [3]:
# Load the politicians' Twitter accounts saved by get_politicians()
df = pd.read_csv('data/US-politicians.csv', encoding='utf-8')
# Split by party label and keep each party's screen names as a set
gop = df[df.party == 'gop']
dem = df[df.party == 'dem']
gop_tweeps = set(gop['screen_name'])
dem_tweeps = set(dem['screen_name'])
# Principal Accounts of Members of the U.S. Senate (a mix of campaign and government accounts)
senate = pd.read_csv('data/US-senate.csv', encoding='utf-8')
In [4]:
# Commentary tweets of US newsmakers and opinion-shapers; 'dt' is parsed as a date
tweets = pd.read_csv('data/US-tweets.csv', parse_dates=['dt'], encoding='utf-8')
# Drop the first character of every handle (presumably the leading '@' - verify against the CSV)
tweets['twhandle'] = tweets['twhandle'].str.slice(start=1)
In [5]:
# print politician counts curated at least once by theplazz.com
# One group per news item, keyed by (title, dt); the values are the commentators' handles.
title = tweets.groupby(by=['title','dt'])['twhandle']
# min()/max() give the covered date range directly. The previous code sorted the
# column twice with Series.order(), which was removed from pandas (use sort_values).
print (len(title),'news commentated between',tweets['dt'].min().strftime('%d-%b-%Y'),
       'and',tweets['dt'].max().strftime('%d-%b-%Y'),'by')
tweepset = set(tweets.twhandle.unique())
senateset = set(senate.screen_name.values)
# How many accounts of each kind appear among the commentators
twcounts = pd.DataFrame(columns=['# of tweeps'])
twcounts.loc['senator'] = [len(senateset & tweepset)]
twcounts.loc['democrat'] = [len(dem_tweeps & tweepset)]
twcounts.loc['republican']= [len(gop_tweeps & tweepset)]
twcounts.loc['total'] = [len(tweepset)]
twcounts
Out[5]:
In [6]:
# plot commentating activity of these politicians
# Number of commentary tweets per handle, most active first.
# Series.order() was removed from pandas; sort_values() is its replacement.
tweeps = tweets.groupby(by='twhandle')['twtext'].count().sort_values(ascending=False)
# Keep only the handles that belong to the politicians table
poltweeps = tweeps[tweeps.index.isin(df.screen_name)]
# Democrats in blue, everybody else in the politicians table in red
colors = ['blue' if x in dem_tweeps else 'red' for x in poltweeps.index]
data = Data([Bar(
    x=poltweeps.index,
    y=poltweeps.values,
    marker=Marker(color=colors)
)])
layout = Layout(yaxis=YAxis(title='# of news commentated (Jan 2013 - Jan 2015)'),
                title="News counts commentated by 113th US Congress (curated by theplazz.com)")
fig = Figure(data=data, layout=layout)
py.iplot(fig,filename="113th US Congress as News Commentators")
Out[6]:
In [7]:
# Stats: how many news are commentated by how many democrats and/or republicans...
# Per news item: number of distinct commentators from each party
demnews = title.apply(lambda g: len(dem_tweeps & set(g.values)))
gopnews = title.apply(lambda g: len(gop_tweeps & set(g.values)))
print (demnews.sum(),'comments made on',demnews[demnews>0].size,'news by democrats.')
print (gopnews.sum(),'comments made on',gopnews[gopnews>0].size,'news by republicans.')
dgtotl = (demnews + gopnews)
print ('News commentated by any member of either group:',(dgtotl[dgtotl>0].size))
# Number of comments by dems - number of comments by gops
dgdiff = (demnews - gopnews)
# Normalize the polarity: -1 = republicans only, +1 = democrats only.
# News without any politician comment yield 0/0 = NaN.
dgdiv = dgdiff/dgtotl
# Drop those NaN rows explicitly and sort ascending. The previous code relied on
# Series.order() (removed from pandas) placing NaN last and sliced them off by count.
digdiv = dgdiv.dropna().sort_values()
print ('News commentated by democrats only:',(digdiv[digdiv == 1].size))
print ('News commentated by republicans only:',(digdiv[digdiv == -1].size))
print ('News commentated by both of the parties:',(digdiv[(digdiv > -1) & (digdiv < 1)].size))
In [8]:
# Polarity distribution of the news commentated by both parties,
# i.e. polarity strictly between -1 and 1
digdiv[digdiv.abs() < 1].plot();
In [9]:
# Bar chart of the news commentated by both parties (polarity strictly between -1 and 1).
bipartisan = digdiv[(digdiv > -1) & (digdiv < 1)]
data = Data([Bar(
    # Labels must come from the same filtered series as the values. Previously x
    # was taken from the unfiltered digdiv, so titles and bars were misaligned.
    x=bipartisan.index.get_level_values(0),
    y=bipartisan
)])
# NOTE(review): the y-axis title below looks copied from the activity chart;
# the plotted values are normalized polarities, not news counts - confirm wording.
layout = Layout(yaxis=YAxis(title='# of news commentated (Jan 2013 - Jan 2015)'),
                margin=Margin(l=150,r=150,b=150),
                title="News polarized by 113th US Congress (curated by theplazz.com)")
fig = Figure(data=data, layout=layout)
py.iplot(fig,filename="Polarity distribution of news")
Out[9]:
In [10]:
# Which news got the most attention by the politicians ?
# Series.order() was removed from pandas; sort_values() is its replacement.
dgtotl.sort_values(ascending=False).head(60)
Out[10]:
In [11]:
# On which news the comment-count differences maximized?
# Ascending: most republican-heavy first, most democrat-heavy last.
# Series.order() was removed from pandas; sort_values() is its replacement.
dgdiff.sort_values()
Out[11]:
In [19]:
# create the bipartite (two-mode) network for bpnet
G2 = nx.Graph()
# mode 0 (actors): one node per politician, carrying the party colour
for handle, party_color in zip(poltweeps.index.tolist(), colors):
    G2.add_node(handle, bipartite=0, color=party_color)
# mode 1 (events): one node per entry of the per-news totals index
for news_key in dgtotl.index.tolist():
    G2.add_node(news_key, bipartite=1)
In [20]:
def updateG2(group, G2, politicians):
    """Connect one news item to every politician who commented on it.

    Args:
        group: the handles of one groupby group; ``group.name`` is used as
            the event node.
        G2: graph receiving the (unweighted) two-mode edges.
        politicians: container of handles allowed as actor nodes.
    """
    event = group.name
    # A set, so a handle repeated within the group yields a single edge
    for handle in set(group.tolist()):
        if handle in politicians:
            G2.add_edge(event, handle)
In [21]:
# add edges: updateG2 is called once per news group
title.apply(updateG2, G2, set(poltweeps.index))
# print number of nodes and edges; every node not in mode 0 counts as an event
actors = sum(1 for _, attrs in G2.nodes(data=True) if attrs['bipartite'] == 0)
events = G2.number_of_nodes() - actors
print ('actors:',actors,'\tevents:',events,'\tedges:',G2.number_of_edges())
In [23]:
"""The Network File is text file with a binary rectangular matrix.
The number of rows for the matrix should be the same as the number of Actors(A),
and the number of columns is the number of Actors(P)."""
# Split the nodes of the bipartite graph by mode (0 = actors, 1 = events)
actors, events = [], []
for node, attrs in G2.nodes(data=True):
    if attrs['bipartite'] == 0:
        actors.append(node)
    elif attrs['bipartite'] == 1:
        events.append(node)
# Binary incidence matrix: one row per event, one column per actor
M = np.matrix(np.zeros((len(events), len(actors))))
for (i, event), (j, actor) in itertools.product(enumerate(events), enumerate(actors)):
    if G2.has_edge(actor, event):
        M[i, j] = 1
np.savetxt("data/congress_2mode.txt", M, fmt='%d')
In [ ]:
#ERGM analysis:
"""
b2nodematch is a homophily based two-star statistic. This term adds one statistic to the model unless diff is set to TRUE, in which case the term adds multiple network statistics to the model, one for each of (a subset of) the unique values of the attrname attribute.
"""
R code (ERGM on the bipartite, two-mode network):
#two mode
# Read the binary matrix written by the Python export (rows = events, columns = actors).
commentaries <- read.table('data/congress_2mode.txt',sep=' ')
# Transpose so that rows become actors and columns become events.
commentaries <- as.data.frame(t(commentaries))
# Party label per actor, from the attribute file written by the Python export.
parties <- read.table('data/congress_attributes.txt',sep=' ',header=T,stringsAsFactors=FALSE)
# Build the undirected bipartite network.
# NOTE(review): bipartite=66 hard-codes the size of the first mode; confirm it
# matches the number of actor rows in the transposed matrix.
two_mode <-network(commentaries, vertex.attr=parties, matrix.type='bipartite',
directed=F, hyper=F, loops=F, multiple=F, bipartite=66)
# Set 'party' to NA on vertices 67..N (presumably the event vertices - confirm).
set.vertex.attribute(two_mode, 'party', NA, v=seq_len(network.size(two_mode)-66)+66)
summary(two_mode, print.adj=F)
# ERGM with an edges term plus one b1nodematch statistic per party value (diff=T).
two_mode.diff<-ergm(two_mode~edges+b1nodematch("party",diff=T))
In [ ]:
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.921374 0.016658 2 <1e-04 ***
b1nodematch.party.D 0.520067 0.003541 2 <1e-04 ***
b1nodematch.party.R 0.261082 0.017989 2 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 52601 on 486813 degrees of freedom
AIC: 52607 BIC: 52640 (Smaller is better.)
============================ (another run results)
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.950125 0.023855 1 <1e-04 ***
b1nodematch.party.D 0.377780 0.023558 2 <1e-04 ***
b1nodematch.party.R 0.628042 0.003964 2 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 52144 on 486813 degrees of freedom
AIC: 52150 BIC: 52183 (Smaller is better.)
============================ (another run results)
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.920042 0.019739 1 <1e-04 ***
b2nodematch.party.D 0.358749 0.034140 1 <1e-04 ***
b2nodematch.party.R 0.601879 0.002838 3 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 52254 on 486813 degrees of freedom
AIC: 52260 BIC: 52293 (Smaller is better.)
============================ (another run results)
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.949630 0.028055 1 <1e-04 ***
b2nodematch.party.D 0.530463 0.003897 1 <1e-04 ***
b2nodematch.party.R 0.273482 0.020214 2 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 52477 on 486813 degrees of freedom
AIC: 52483 BIC: 52516 (Smaller is better.)
In [ ]:
#when party types are not differentiated
# Same model with a single b1nodematch statistic shared by all party values (no diff=T).
two_mode_b<-ergm(two_mode~edges+b1nodematch("party"))
# Print the fitted coefficients and fit statistics.
summary(two_mode_b)
==========================
Summary of model fit
==========================
Formula: two_mode ~ edges + b1nodematch("party")
Iterations: 20 out of 20
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.82148 0.02591 1 <1e-04 ***
b1nodematch.party 0.31572 0.02996 1 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 52736 on 486814 degrees of freedom
AIC: 52740 BIC: 52762 (Smaller is better.)
============================ (another run results)
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.86517 0.01552 2 <1e-04 ***
b2nodematch.party 0.34602 0.01707 2 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 52590 on 486814 degrees of freedom
AIC: 52594 BIC: 52616 (Smaller is better.)
============================ (another run results)
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.77669 0.02745 0 <1e-04 ***
b2nodematch.party 0.23968 0.02059 1 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 53149 on 486814 degrees of freedom
AIC: 53153 BIC: 53175 (Smaller is better.)
============================ (another run results)
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.826963 0.018368 1 <1e-04 ***
b1nodematch.party 0.247133 0.007967 4 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 53158 on 486814 degrees of freedom
AIC: 53162 BIC: 53184 (Smaller is better.)
In [12]:
# Actor (one-mode) network:
#   vertices         <- commentators
#   edges (weighted) <- number of news commentated by both vertices incident to the edge
G = nx.Graph()
# one vertex per politician, carrying the party colour
for handle, party_color in zip(poltweeps.index.tolist(), colors):
    G.add_node(handle, color=party_color)
In [13]:
def updateG(group, G, politicians):
    """Add weighted co-commentary edges for one news item.

    Every pair of distinct politicians found in ``group`` gets its edge weight
    increased by one (the edge is created with weight 1 when missing), so a
    weight counts the news items both politicians commented on.

    Args:
        group: the handles that commented on one news item (needs ``tolist()``).
        G: graph whose edges carry a ``'weight'`` attribute.
        politicians: container of handles allowed as vertices.
    """
    # De-duplicate (keeping first-seen order) and filter before pairing.
    # A handle appearing twice for the same news item previously produced a
    # self-loop and inflated the weights; filtering first also avoids building
    # pairs that would be discarded anyway.
    actors = [handle for handle in dict.fromkeys(group.tolist())
              if handle in politicians]
    for v1, v2 in itertools.combinations(actors, 2):
        if G.has_edge(v1, v2):
            G[v1][v2]['weight'] += 1
        else:
            G.add_edge(v1, v2, weight=1)
In [16]:
# add edges: updateG is called once per news group
known_politicians = set(poltweeps.index)
title.apply(updateG, G, known_politicians)
# print number of nodes and edges
print (G.number_of_nodes(),G.number_of_edges())
In [17]:
# Exporting to be read by Gephi for better visualization
# nx.write_gml(G,"data/theplazz_politics.gml")
# export for R-ergm
# Adjacency matrix of the actor network; entries are the 'weight' edge attributes.
# NOTE(review): to_numpy_matrix is not available in recent networkx releases
# (to_numpy_array is the successor) - confirm the networkx version in use.
A = nx.to_numpy_matrix(G, weight='weight')
# Written as whitespace-separated integers
np.savetxt("data/congress_actors_weighted.txt", A, fmt='%d')
In [15]:
# network file for pnet
# weight=None ignores the 'weight' attribute (presumably giving a 0/1 matrix - confirm).
# NOTE(review): to_numpy_matrix is not available in recent networkx releases
# (to_numpy_array is the successor) - confirm the networkx version in use.
A = nx.to_numpy_matrix(G, weight=None)
np.savetxt("data/congress_actor.txt", A, fmt='%d')
In [22]:
# Attribute file for pnet: a 'party' header line, then one code per vertex
# of G in node order ('2' for red vertices, '1' for all others)
party = ['party'] + ['2' if attrs['color'] == 'red' else '1'
                     for _, attrs in G.nodes(data=True)]
with open('data/congress_attribute.txt','w') as w:
    w.write('\n'.join(party))
In [28]:
# Attribute file for R-ergm: a 'party' header line, then one label per vertex
# of G in node order ('R' for red vertices, 'D' for all others)
party = ['party'] + ['R' if attrs['color'] == 'red' else 'D'
                     for _, attrs in G.nodes(data=True)]
with open('data/congress_attributes.txt','w') as w:
    w.write('\n'.join(party))
In [23]:
# 62 of the 65 monitored Congress members fall into the same group as their fellow party members.
# The 3 members who do not are circled in the image.
from IPython.display import Image
# Embed the externally hosted figure in the notebook output
Image(url='http://talhaoz.com/wp-content/uploads/2015/03/Modularity_Labeled.png')
Out[23]: